#!/usr/bin/awk -f

# Copyright (C) 2002 Matthias S. Benkmann <m.s.b@gmx.net>
#
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; version 2
#of the License (ONLY THIS VERSION).
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program; if not, write to the Free Software
#Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


#empty tag:  {tag}
#<tag>foo</tag>:   tag{foo}
#<tag attr="bar">foo</tag>:   tag{attr[bar]foo}
#                          or tag{foo attr[bar]}
# bar<tag>foo</tag>:  bar tag{foo}
#
#Notes:-Literal "{","[" in a context where they could be mistaken for
#       tags or attributes must be written as "&#123;","&#91;"
#      -If tag or attribute name is preceded by a space, this space is deleted
#       (only a single space character, not the whole whitespace)
#      -a {!DOCTYPE .... EPYTCOD!} block may precede the 1st tag; this block
#       is transformed into a <!DOCTYPE ... > declaration
#      -if you pass the parameter "--tag-paras", then every sequence of
#       non-empty lines surrounded by empty lines will be wrapped in
#       <paragraph>...</paragraph> unless that would cause improper nesting
#      -if you pass the parameter "--no-attr" then attributes are not
#       supported (freeing up "[" and "]" for normal use)
#      -if you pass the parameter "--no-chewspace", then the chewing of a
#       single space before "attr[" and "tag{" is disabled.
#      -the default encoding is iso-8859-1. To use a different one pass
#       "--encoding=<encoding>"

# Global setup: the regular-expression building blocks (kept as strings so
# they can be concatenated) and the initial parser state / option flags.
BEGIN {
  # --- character classes and line shapes ---------------------------------
  WHITESPACE_re          = "[\1-\x20]*"
  NON_WHITESPACE_CHAR_re = "([^\1-\x20])"
  EOL_re                 = "[\n]"
  EMPTYLINE_re           = "^" WHITESPACE_re EOL_re
  NONEMPTYLINE_re        = "^[^\xA\xD]*" NON_WHITESPACE_CHAR_re "[^\xA\xD]*" EOL_re

  # --- tags: name{ ... } and the bare literal brace -----------------------
  XMLTAG_re   = "([a-zA-Z_:][a-zA-Z_:0-9.-]*)"
  XMLSTART_re = "( ?" XMLTAG_re "{)"
  XMLEND_re   = "}"
  BRACE_re    = "{"

  # --- attributes: name[ ... ] (names follow the same rule as tag names) --
  ATTR_re      = XMLTAG_re
  ATTRSTART_re = "( ?" ATTR_re "\\[)"
  ATTREND_re   = "\\]"

  # --- optional {!DOCTYPE ... EPYTCOD!} prologue --------------------------
  DOCTYPESTART_re = "({!DOCTYPE)"
  DOCTYPEEND_re   = "(EPYTCOD!})"
  DOCTYPE_re      = "(" DOCTYPESTART_re WHITESPACE_re ".*" DOCTYPEEND_re ")"

  # --- numeric (decimal / hex) and named entity references ----------------
  ENTITYREF_re = "((&#[0-9]+;)|(&#x[0-9a-fA-F]+;)|(&[a-zA-Z][a-zA-Z0-9]*;))"

  # Everything the scanner has to stop at; plain text lies between matches.
  STOP_re = "(\"|" ENTITYREF_re "|" EOL_re "|" BRACE_re "|" XMLSTART_re "|" XMLEND_re "|" ATTRSTART_re "|" ATTREND_re "|<|&)"

  # --- parser state --------------------------------------------------------
  top            = 0    # number of currently open tags / braces
  outtop         = 0    # number of entries in outcmd[] / outdata[]
  line           = 1    # current input line, used in error messages
  emptylinefound = 0    # paragraph-detection flag for --tag-paras

  # --- options (defaults; see the notes at the top of the file) ------------
  tagparas  = 0
  chewspace = 1
  noattr    = 0
  encoding  = "iso-8859-1"
  debug     = 0
}

# process(): tokenize the current record ($0) into the command list
# outcmd[0..outtop-1] / outdata[0..outtop-1].
#
# The record is scanned for successive matches of STOP_re. The text in front
# of each match is stored as a "lit" command whose data is "start,length"
# (a position inside $0), then the matched token is dispatched on. A token
# that has no special meaning in the current context is folded back into the
# preceding literal (see "backstep").
#
# It is called without arguments; the whole parameter list only declares
# local variables (idx is declared but not used). top, outtop, line,
# emptylinefound, outcmd, outdata and tagoutidx are not in that list and are
# therefore globals.
#
# On a syntax error ("}" without open tag, attribute outside a tag, unclosed
# attribute or tag) a message is printed to /dev/stderr and the program
# exits with status 1.
function process(      A,rest,rstart,rlength,scanstart,found,tagstack,attrstack,idx,litstart,litlength,insideattr,attrname,attrstartidx,attrtagoutidx,attrstartline,tagline,backstep)
{
  # rest is the not yet scanned tail of $0; scanstart is the position in $0
  # of its first character; litstart/litlength describe the literal that is
  # currently being accumulated.
  rest=$0
  scanstart=1
  litstart=1
  litlength=0
  insideattr=0
  while (match(rest,STOP_re))
  {
    found=substr(rest,RSTART,RLENGTH)
    if (debug) { print "process(): found=\"" found "\"" >"/dev/stderr" }
    # Save RSTART/RLENGTH: later match() calls overwrite them.
    rstart=RSTART
    rlength=RLENGTH
    # The text in front of the match belongs to the current literal.
    litlength+=rstart-1
    rest=substr(rest,rstart+rlength)
    scanstart=scanstart+rstart-1+rlength
    
    # With space chewing disabled, the optional leading space of a tag or
    # attribute start is given back to the literal instead of being dropped.
    if (chewspace==0 && substr(found,1,1)==" " && 
       (found ~ ATTRSTART_re || found ~ XMLSTART_re)) 
    {
      ++litlength
      --rlength
      ++rstart
      found=substr(found,2)
    }  
    # Emit the literal collected so far and start a new, empty one right
    # behind the matched token.
    outcmd[outtop]="lit"
    outdata[outtop]=litstart","litlength
    ++outtop
    litstart=scanstart
    litlength=0
    
    # backstep stays 1 unless a branch below decides that the token is
    # special; in that case the token is handled and backstep is cleared.
    backstep=1
    
    if (found ~ ENTITYREF_re)
    {
      # Entity references are passed through verbatim: the new literal is
      # moved back so that it starts at the entity itself.
      litlength+=rlength
      litstart-=rlength
      backstep=0
    }
    else
    if (found ~ "<")
    {
      outcmd[outtop]="<"
      ++outtop
      backstep=0
    }
    else
    if (found ~ "&")
    {
      outcmd[outtop]="&"
      ++outtop
      backstep=0
    }
    else
    if (found ~ EOL_re )
    {
      ++line
      backstep=1
      # Paragraph detection (--tag-paras), never inside an attribute value.
      if (tagparas && !insideattr)
      {
        if (match(rest,NONEMPTYLINE_re))
        {
          # A non-empty line follows. If an empty line was seen before,
          # this is the beginning of a paragraph candidate.
          if (emptylinefound)
          {
            # Append the newline to the literal that was just emitted, then
            # record the paragraph start behind it.
            split(outdata[outtop-1],A,",")
            outdata[outtop-1]=A[1]","(A[2]+rlength)
            outcmd[outtop]="parastart"
            outdata[outtop]=""
            ++outtop
            emptylinefound=0
            backstep=0
          }  
        }
        else if (match(rest,EMPTYLINE_re)) 
        {
          # An empty line follows: the first one after non-empty text marks
          # the end of a paragraph candidate.
          if (!emptylinefound)
          {
            split(outdata[outtop-1],A,",")
            outdata[outtop-1]=A[1]","(A[2]+rlength)
            outcmd[outtop]="paraend"
            outdata[outtop]=""
            ++outtop
            emptylinefound=1
            backstep=0
          }  
        }
      }  
    }
    else
    {
      if (!insideattr)
      {
        if (found ~ XMLSTART_re)
        {
          # "name{": open a tag. Remember its name, its line (for error
          # messages) and the index of its command (attributes are appended
          # to that command's data later).
          match(found,XMLTAG_re)
          found=substr(found,RSTART,RLENGTH)
          tagline[top]=line
          tagstack[top]=found
          tagoutidx[top]=outtop
          outcmd[outtop]="starttag"
          outdata[outtop]=found
          ++top
          ++outtop
          backstep=0
        }
        else
        if (found ~ XMLEND_re)
        {
          if (top==0)
          {
            print "ERROR(line "line"): } without {tag"  >"/dev/stderr"
            exit 1
          }
          
          # Close whatever was opened last: a literal brace gets a "}"
          # command, a real tag gets an "endtag" command carrying its name.
          top=top-1
          if (tagstack[top]=="{")
            outcmd[outtop]="}"
          else
          {
            outcmd[outtop]="endtag"
            outdata[outtop]=tagstack[top]
          }  
          ++outtop
          backstep=0
        }
        else
        if (noattr==0 && found ~ ATTRSTART_re)
        {
          if (top==0)
          {
            print "ERROR(line "line"): [ not inside tag"  >"/dev/stderr"
            exit 1
          }
          
          # "name[": start collecting an attribute value. The commands
          # produced from here up to the closing "]" form the value.
          match(found,ATTR_re)
          found=substr(found,RSTART,RLENGTH)
          attrstartline=line
          attrname=found
          attrstartidx=outtop
          attrtagoutidx=tagoutidx[top-1]
          insideattr=1
          backstep=0
        }
        else
        if (found ~ BRACE_re)
        {
          # A "{" without a tag name: tracked on the tag stack under the
          # name "{" so that the matching "}" can be paired with it.
          tagline[top]=line
          tagstack[top]="{"
          tagoutidx[top]=outtop
          outcmd[outtop]="{"
          ++top
          ++outtop
          backstep=0
        }
      }
      else #if (insideattr)
      {
        if (found ~ ATTREND_re)
        {
          # Render the commands collected since the attribute start into a
          # string, append name="value" to the data of the enclosing
          # command, and discard those commands by resetting outtop.
          outdata[attrtagoutidx]=outdata[attrtagoutidx]" "attrname"=\""outString(attrstartidx,0)"\""
          outtop=attrstartidx
          insideattr=0
          backstep=0
        }
        else
        if (found ~ "\"")
        {
          # A double quote inside an attribute value gets its own command.
          outcmd[outtop]="\""
          ++outtop
          backstep=0
        }
      }
    }  
    
    # The token was not special here: take back the literal emitted above
    # and extend it so that it also covers the token text.
    if (backstep)
    {
      --outtop
      split(outdata[outtop],A,",")
      litstart=A[1]
      litlength=A[2]+rlength
    }  
    
  }
  
  # End of input reached: everything that was opened must be closed by now.
  if (insideattr)
  {
    print "ERROR(line "attrstartline"): Attribute "attrname" not closed"  >"/dev/stderr"
    exit 1
  }
  
  if (top!=0)
  {
    print "ERROR(line "tagline[top-1]"): Tag "tagstack[top-1]" not closed"  >"/dev/stderr"
    exit 1
  }
}

# outString(startidx, doparas): render the commands outcmd[startidx..outtop-1]
# into a string and return it.
#
#   startidx  index of the first command to render
#   doparas   if true (and the global tagparas is set), "parastart"/"paraend"
#             commands produce <paragraph>...</paragraph>; otherwise they
#             produce nothing
#
# "lit" commands copy a substring of $0, so $0 must still hold the text the
# commands were created from. The remaining parameters are locals. An
# unknown command prints an internal error to /dev/stderr and exits with
# status 1.
function outString(startidx,doparas,   i,j,count,res,A,inpara)
{
  inpara=0
  res=""
  i=startidx-1
  while (++i<outtop)
  {
    # Literal text: data is "start,length" inside $0.
    if (outcmd[i]=="lit")
    {
      split(outdata[i],A,",")
      res=res substr($0,A[1],A[2])
      continue
    }

    # Characters that need escaping in the output.
    if (outcmd[i]=="&")  { res=res "&amp;";  continue }
    if (outcmd[i]=="<")  { res=res "&lt;";   continue }
    if (outcmd[i]=="\"") { res=res "&quot;"; continue }

    # Literal braces are written back as they are.
    if (outcmd[i]=="{" || outcmd[i]=="}")
    {
      res=res outcmd[i]
      continue
    }

    # Tags: data is the tag name (plus attributes for start tags).
    if (outcmd[i]=="starttag") { res=res "<"outdata[i]">";  continue }
    if (outcmd[i]=="endtag")   { res=res "</"outdata[i]">"; continue }

    if (outcmd[i]=="parastart")
    {
      if (!tagparas || !doparas) continue

      # Look ahead to the matching "paraend". The paragraph is only opened
      # if start and end tags are balanced in between and no tag is closed
      # that was opened before the paragraph.
      count=0
      j=i
      while (j<outtop && outcmd[j]!="paraend")
      {
        if (outcmd[j]=="starttag") ++count
        if (outcmd[j]=="endtag") --count
        if (count<0) break
        ++j
      }
      if (j<outtop && count==0)
      {
        inpara=1
        res=res"<paragraph>\n"
      }
      continue
    }

    if (outcmd[i]=="paraend")
    {
      # Only close a paragraph that was actually opened above.
      if (tagparas && doparas && inpara)
      {
        inpara=0
        res=res"</paragraph>"
      }
      continue
    }

    print "ERROR(line "line"): Internal error (Unknown command:"outcmd[i]")"  >"/dev/stderr"
    exit 1
  }
  return res
}

# parse_commandline(): evaluate the script's own options in ARGV[1..ARGC-1]
# and set the corresponding globals:
#
#   --tag-paras          tagparas=1
#   --no-attr            noattr=1
#   --no-chewspace       chewspace=0
#   --encoding=<name>    encoding=<name>
#
# Every recognised option is replaced by the empty string in ARGV. awk skips
# empty ARGV elements when it looks for input files, so the options are not
# mistaken for file names and, if no file name remains, standard input is
# read. Arguments that are not recognised are left untouched (they are taken
# to be input files). The parameter i is a local variable.
function parse_commandline(   i)
{
  for (i=1; i<ARGC; ++i)
  {
    if (ARGV[i]=="--tag-paras")
    {
      tagparas=1
    }
    else if (ARGV[i]=="--no-attr")
    {
      noattr=1
    }
    else if (ARGV[i]=="--no-chewspace")
    {
      chewspace=0
    }
    else if (match(ARGV[i],/^--encoding=/))
    {
      # Everything behind the "=" is the encoding name.
      encoding=substr(ARGV[i],RLENGTH+1)
    }
    else
    {
      # Not one of our options: keep it in ARGV.
      continue
    }

    # The option has been consumed; hide it from awk's input file handling.
    ARGV[i]=""
  }
}



# Main driver. Reads the input, writes the XML declaration and an optional
# <!DOCTYPE ...>, then converts the rest of the input and prints the result.
BEGIN {
  # RS and FS are set to a string that is not expected to occur in the
  # input, with the intent that the single getline below delivers the whole
  # input as one record (this relies on an awk that supports a
  # multi-character RS).
  RS="ThisStRinGNEvErOCCUrsiNinPUt"
  FS="ThisStRinGNEvErOCCUrsiNinPUt"
  parse_commandline()
  getline
  print "<?xml version='1.0' encoding='" encoding "'?>"

  # Optional {!DOCTYPE ... EPYTCOD!} block in front of the first tag.
  if (match($0,"^" WHITESPACE_re DOCTYPESTART_re))
  {
    doctype_rest=substr($0,RSTART+RLENGTH)
    # match() returns the leftmost terminator, so the block ends at the
    # FIRST "EPYTCOD!}". (Matching the whole block with DOCTYPE_re would,
    # because of its greedy ".*", run up to the LAST terminator in the
    # document and silently drop all text in between.)
    if (match(doctype_rest,DOCTYPEEND_re))
    {
      doctype_text=substr(doctype_rest,1,RSTART-1)
      doctype_after=substr(doctype_rest,RSTART+RLENGTH)

      # process() counts lines from the start of $0. Add the newlines of the
      # part that is cut off here so that its error messages refer to the
      # line numbers of the original input.
      doctype_consumed=substr($0,1,length($0)-length(doctype_after))
      line+=gsub(/\n/,"\n",doctype_consumed)

      $0=doctype_after
      print "<!DOCTYPE "doctype_text">"
    }
  }
  process()
  print outString(0,1)
}

